# Impute missing data with missForest (random-forest-based imputation)
# Input: data from (1)
# Output: data.table with missing values imputed
# NOTE: the workspace is deliberately not cleared here. `rm(list = ls())`
# deletes the caller's objects when this file is source()d and does not reset
# loaded packages or options, so it gives no real isolation; run the script
# in a fresh R session instead.
options(stringsAsFactors = FALSE)
# fixed seed for the random number generator used by the imputation below
seed_to_use <- 614
set.seed(seed_to_use)
library(data.table)
library(parallel)
library(missForest)
# load() returns the names of the objects it restored. Check that the
# expected table really came from the file, so a stale object of the same
# name already in the workspace cannot be used by accident.
loaded_objects <- load("protests_econ_data.rdata")
stopifnot(
  "protests_econ_data" %in% loaded_objects,
  is.data.table(protests_econ_data)
)
# use only variables we need
d <- protests_econ_data[, .(cab_usd, gdp_pc, cab_pct_gdp,
  ext_trade_bal, year, country, e3, strikes, riots, antidemo, democ,
  total_protests, balance_of_payment, gdp_2010, pop_count)]
# only data w/DV
d <- d[year >= 1970]
# country and year enter the imputation as categorical variables
d[, `:=`(country = as.factor(country), year = as.factor(year))]
# missForest expects `strata` to be a list with one entry per column of the
# input. Build it by column name rather than by hard-coded position so it
# stays aligned if the column selection above is ever reordered.
# NOTE(review): the values 1:34 (year) and 1:17 (country) are kept exactly as
# in the original script -- presumably the number of years / countries;
# confirm. Also confirm that `strata` has the intended effect when no
# per-stratum `sampsize` is supplied.
strata <- vector("list", ncol(d))
strata[[match("year", names(d))]] <- 1L:34L
strata[[match("country", names(d))]] <- 1L:17L
d2 <- missForest(d, strata = strata)
# extract the imputed data; convert explicitly so the `:=` operations that
# follow work on a valid data.table whatever class missForest hands back
d3 <- as.data.table(d2$ximp)
# balance of payments relative to GDP; a row-wise ratio, so no grouping needed
d3[, standardize_bop := balance_of_payment / gdp_2010]
# z-score of cab_usd over the whole sample
d3[, std_cab_usd :=
  (cab_usd - mean(cab_usd, na.rm = TRUE)) / sd(cab_usd, na.rm = TRUE)]
# regional pct democracy: sum of democ over the number of distinct countries,
# computed separately for each year
pct_dem <- d3[, .(pct_dem = sum(democ) / uniqueN(country)), by = year]
pct_dem[, year := as.character(year)]
d3[, country := as.character(country)]
d3 <- merge(d3, pct_dem, by = "year", all = TRUE)
# Year-over-year relative change in GDP within each country. `order(year)` in
# `i` makes the chronological order that shift() relies on explicit instead
# of depending on merge() having sorted the rows; the row order of d3 itself
# is left unchanged.
d3[order(year), gdp_pct_change := {
  prev_gdp <- shift(gdp_2010, n = 1L)
  (gdp_2010 - prev_gdp) / prev_gdp
}, by = country]

# creating final lagged data
# Each vector maps a new lag column (name) to the column it is lagged from
# (value). The order of the entries is the order in which the columns are
# appended to d3.
lag1_cols <- c(
  lag1_standardize_bop = "standardize_bop",
  lag1_std_cab_usd = "std_cab_usd",
  lag1_balance_of_payment = "balance_of_payment",
  lag1_cab_usd = "cab_usd",
  lag1_cab_pct_gdp = "cab_pct_gdp",
  lag1_gdp_2010 = "gdp_2010",
  lag1_pop_count = "pop_count",
  lag1_e3 = "e3",
  lag1_gdp_pc = "gdp_pc",
  lag1_democ = "democ",
  lag1_pct_dem = "pct_dem",
  lag1_trade = "ext_trade_bal",
  lag1_gdp_pct_change = "gdp_pct_change"
)
lag2_cols <- c(
  lag2_trade = "ext_trade_bal",
  lag2_std_cab_usd = "std_cab_usd",
  lag2_standardize_bop = "standardize_bop",
  lag2_balance_of_payment = "balance_of_payment",
  lag2_cab_usd = "cab_usd",
  lag2_cab_pct_gdp = "cab_pct_gdp",
  lag2_gdp_2010 = "gdp_2010",
  lag2_pop_count = "pop_count",
  lag2_e3 = "e3",
  lag2_democ = "democ",
  lag2_pct_dem = "pct_dem",
  lag2_gdp_pc = "gdp_pc",
  lag2_gdp_pct_change = "gdp_pct_change"
)
# Lag within country. `order(year)` in `i` makes shift() see each country's
# rows in chronological order regardless of how d3 happens to be sorted; the
# results are written back to the original rows, so row order is unchanged.
d3[order(year), (names(lag1_cols)) := shift(.SD, n = 1L),
  by = country, .SDcols = unname(lag1_cols)]
d3[order(year), (names(lag2_cols)) := shift(.SD, n = 2L),
  by = country, .SDcols = unname(lag2_cols)]
# Store the identifiers as plain types: country as character, year as integer.
# year goes through as.character() first so that a factor yields its label
# rather than its internal level code.
d3[, `:=`(
  country = as.character(country),
  year = as.integer(as.character(year))
)]

# save the result under the object name `imputed_protests`
imputed_protests <- d3
save(imputed_protests, file = "imputed_protests.rdata")

